package cn.goour.web.base.tools

import java.io.UnsupportedEncodingException
import java.util.*
import kotlin.experimental.and

/**
 * Character-frequency cosine similarity for Chinese text.
 *
 * Each document is reduced to a vector counting how often every GB2312 Han
 * character occurs in it; the similarity is the cosine of the angle between
 * the two vectors. Characters outside the range U+4E00..U+9FA5, and Han
 * characters that GB2312 cannot encode, are ignored.
 */
object StringSimilar {
    /** First code point of the range treated as Han characters. */
    private const val HAN_FIRST = '\u4E00'

    /** Last code point of the range treated as Han characters. */
    private const val HAN_LAST = '\u9FA5'

    /** GB2312 double-byte codes start at 0xA1 in both the lead and the trail byte. */
    private const val GB2312_BASE = 0xA1

    /** Multiplier applied to the row index when flattening (row, cell) into one id. */
    private const val GB2312_ROW_SIZE = 94

    /** Sentinel id for characters that have no two-byte GB2312 encoding. */
    private const val UNKNOWN_ID = -1

    /**
     * GB2312 charset, resolved once on first use instead of once per character.
     * If the runtime does not ship GB2312, the lookup throws (unchecked) on every
     * access, so the misconfiguration is never silently turned into a bogus score.
     */
    private val gb2312Charset by lazy { charset("GB2312") }

    /**
     * Computes the cosine similarity of the Han-character frequency vectors of
     * [doc1] and [doc2].
     *
     * @param doc1 first document; must not be blank
     * @param doc2 second document; must not be blank
     * @return a value in `[0.0, 1.0]`; `0.0` when the documents share no Han
     * character or when at least one of them contains no recognised Han
     * character at all
     * @throws NullPointerException if either document is blank
     */
    fun getSimilarity(doc1: String, doc2: String): Double {
        if (doc1.isBlank() || doc2.isBlank()) {
            throw NullPointerException(
                    " the Document is null or have not cahrs!!")
        }

        // character id -> [occurrences in doc1, occurrences in doc2]
        val frequencies = HashMap<Int, IntArray>()
        countHanZi(doc1, 0, frequencies)
        countHanZi(doc2, 1, frequencies)

        var dotProduct = 0.0
        var squaredNorm1 = 0.0
        var squaredNorm2 = 0.0
        for (counts in frequencies.values) {
            // Convert before multiplying so large counts cannot overflow Int.
            val count1 = counts[0].toDouble()
            val count2 = counts[1].toDouble()
            dotProduct += count1 * count2
            squaredNorm1 += count1 * count1
            squaredNorm2 += count2 * count2
        }

        val normProduct = Math.sqrt(squaredNorm1 * squaredNorm2)
        // A zero-length vector (no Han characters in a document) would make this 0/0 = NaN.
        return if (normProduct == 0.0) 0.0 else dotProduct / normProduct
    }

    /**
     * Adds the Han characters of [doc] to [frequencies], incrementing position
     * [slot] (0 for the first document, 1 for the second) of each character's counter.
     */
    private fun countHanZi(doc: String, slot: Int, frequencies: MutableMap<Int, IntArray>) {
        for (ch in doc) {
            if (!isHanZi(ch)) continue
            val id = getGB2312Id(ch)
            if (id == UNKNOWN_ID) continue
            val counts = frequencies.getOrPut(id) { IntArray(2) }
            counts[slot]++
        }
    }

    /**
     * Returns whether [ch] lies in the range U+4E00..U+9FA5.
     */
    private fun isHanZi(ch: Char) = ch in HAN_FIRST..HAN_LAST

    /**
     * Maps a Unicode character to its position in the GB2312 table.
     *
     * @param ch the character to look up
     * @return `row * 94 + cell`, where row and cell are the two GB2312 bytes
     * minus 0xA1, or -1 when [ch] does not encode to exactly two bytes
     * (ASCII encodes to one byte; an unencodable character is expected to be
     * replaced by a single substitute byte)
     */
    private fun getGB2312Id(ch: Char): Int {
        val bytes = ch.toString().toByteArray(gb2312Charset)
        if (bytes.size != 2) {
            return UNKNOWN_ID
        }
        // Bytes are signed in Kotlin: widen to Int first, then mask to get 0..255.
        val row = (bytes[0].toInt() and 0xFF) - GB2312_BASE
        val cell = (bytes[1].toInt() and 0xFF) - GB2312_BASE
        return row * GB2312_ROW_SIZE + cell
    }
}

/*
fun main(args: Array<String>) {
    println(StringSimilar.getSimilarity("今天的天气真好啊","今天的天气真好"))
}*/
